Introduction

The purpose of this workflow is to look at the genes that have differential states across strains at the TSS.

Here we look at which states are represented at the TSS, and which genes have variation there.

library(here);library(gprofiler2);library(pheatmap);library(fgsea)
is.interactive = FALSE
#is.interactive = TRUE
all.code.dir <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.code.dir)){
    all.fun <- list.files(all.code.dir[i], full.names = TRUE)
    for(j in 1:length(all.fun)){source(all.fun[j])}
}

Load data

transcript.info <- readRDS(here("Data", "RNASeq", "RNASeq_gene_info.RData"))
chrom.mats <- readRDS(here("Results", "ChromHMM", "008_states_C", "Chromatin.States.Gene.Coords.RDS"))
ch.coef <- readRDS(here("Results", "chQTL", "chQTL.coef.RDS"))

Find the states at the TSS of each transcript

get_tss_states <- function(chrom.mat){
    if(length(chrom.mat) == 1){return(rep(NA, 9))}
    tss.pos <- get.nearest.pt(as.numeric(rownames(chrom.mat)), 0)
    return(chrom.mat[tss.pos,,drop=FALSE])
}

#get all the states right at the TSS
tss.states <- t(sapply(chrom.mats, get_tss_states))

#tally which states are present there
state.counts <- t(apply(tss.states, 1, function(y) sapply(1:8, function(x) length(which(y == x)))))
colnames(state.counts) <- 1:8

#find all the genes with variation
var.genes <- which(apply(state.counts, 1, function(x) length(which(x != 0)) > 1))

There are 4448 out of 14903 total genes with variation at the TSS. The figure below shows genes in rows and states in columns. Each cell contains the number of strains that had each state at the TSS for that gene. State 7 is the most abundant state at the TSS even for genes with variation at this position. State 1 is the next most abundant.

var.mat <- state.counts[var.genes,]
pheatmap(var.mat, show_rownames = FALSE)

The figure below shows which states tend to co-occur at the TSS. Most of the states are negatively correlated with each other. State 7 tends to co-occur with state 8, but is very negatively correlated with state 1.

States 1, 2, and 3, all negative states, tend to co-occur, and states 4 and 5 tend to co-occur.

cor.mat <- var.mat
cor.mat[which(var.mat > 0)] <- 1
state.cor <- cor(cor.mat)
diag(state.cor) <- NA
imageWithText(state.cor, split.at.vals = TRUE, col.scale = c("blue", "brown"),
grad.dir = "ends")

state.decomp <- plot.decomp(cor.mat, pc = 8, mean.center = FALSE)

plot(state.decomp$v[,1:2], pch = 16, 
col = colors.from.values(1:8, use.pheatmap.colors = TRUE), xlab = "PC1", ylab = "PC2")
text(state.decomp$v[,1], state.decomp$v[,2], 1:8, pos = 4)

pairs(state.decomp$u[,1:5])
state.pairs <- pair.matrix(1:8)
state.pair.names <- apply(state.pairs, 1, function(x) paste(x, collapse = "_"))
has_pair  <- function(state.mat, state1, state2){
    return(which(apply(state.mat, 1, function(x) x[state1] > 0 && x[state2] > 0)))
}

with_state_pair <- apply(state.pairs, 1, function(x) has_pair(var.mat, x[1], x[2]))
names(with_state_pair) <- state.pair.names
state.pair.count <- sapply(with_state_pair, length)
pair.count.order <- order(state.pair.count)
barplot(state.pair.count[pair.count.order], las = 2, horiz = TRUE, cex.names = 1)

Chromatin State Enrichment

We looked at functional enrichments for genes with each pair of states present at the TSS.

Some of these genes have more than two states at the TSS, so the groups are overlapping somewhat, but using more categories was a bit too disorganized.

enrich.file <- here("Results", "chQTL", "State_Pair_Enrichment.RDS")

if(!file.exists(enrich.file)){
    state_enrich <- vector(mode = "list", length = length(state.pair.count))
    names(state_enrich) <- names(state.pair.count)
    for(i in 1:length(with_state_pair)){
        enrichment <- gost(names(with_state_pair[[i]]), organism = "mmusculus", sources = "GO")
        if(!is.null(enrichment)){
            state_enrich[[i]] <- enrichment
        }
    }
    saveRDS(state_enrich, enrich.file)
}else{
    state_enrich <- readRDS(enrich.file)
}

for(i in 1:length(state_enrich)){
    cat("###", state.pair.names[i], "\n")
    plot.enrichment(state_enrich[[i]], 20, plot.label = state.pair.names[i],
    order.by = "p_value", max.term.size = 500)
    cat("\n\n")
}

1_2

1_3

2_3

1_4

2_4

3_4

1_5

2_5

3_5

4_5

1_6

2_6

3_6

4_6

5_6

1_7

2_7

3_7

4_7

5_7

6_7

1_8

2_8

3_8

4_8

5_8

6_8

7_8

state_gene_names <- lapply(with_state_pair, function(x) transcript.info[match(names(x), transcript.info[,"ensembl_gene_id"]),"external_gene_name"])

We looked to see if any of the genes with particular state combinations at the TSS were enriched in the mismatched genes. None of the groups were enriched.

ch.lod <- readRDS(here("Results", "chQTL", "chQTL.lod.RDS"))
e.lod <- readRDS(here("Results", "eQTL", "eQTL.lod.RDS"))

max.ch.lod <- t(sapply(ch.lod, function(x) if(length(x) > 0){apply(x, 2, function(y) max(y, na.rm = TRUE))}else{rep(NA, 4)}))
rownames(max.ch.lod) <- rownames(e.lod)

lod.diff <- e.lod - max.ch.lod

ordered.genes <- sort(lod.diff[,1], decreasing = TRUE)
plot(ordered.genes)

state_gene_list <- lapply(with_state_pair, function(x) names(x))
lod.diff.enrich <- fgsea::fgseaMultilevel(pathways = state_gene_list, 
  stats = ordered.genes, minSize=15, maxSize=600, gseaParam = 1)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.01% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
plot.gsea.results(lod.diff.enrich)

## NULL